This tutorial is made to introduce Keras at the Machine Learning Study meeting.
The following networks will be exemplified:
Original Japanese Notebook github.
If pyenv is already installed, run the following command:
$ pyenv install anaconda3-4.0.0; pip install keras theano gensim pydot-ng git+https://github.com/dfm/daft.git
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rc
%matplotlib inline
plt.style.use("ggplot")
import daft
from gensim.models.doc2vec import Word2Vec
import json
from IPython.display import SVG, display
import numpy as np
np.random.seed(13)
from keras import backend as K
from keras.models import Model, Sequential, model_from_json
from keras.datasets import mnist, cifar10, imdb
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding, LSTM, GRU, Input, RepeatVector, TimeDistributed, Merge, Lambda, Reshape
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import Adadelta
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.visualize_util import model_to_dot, plot
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, base_filter
# visualization
def draw_digit(data, row, col, n):
    """Render one grayscale digit image in cell *n* of a row x col subplot grid."""
    plt.subplot(row, col, n)
    plt.imshow(data)
    # Force the grayscale colormap for single-channel digit arrays.
    plt.gray()
# Draw a small fully connected 5-4-5 network diagram with daft.
pgm = daft.PGM(shape=[5, 5])

# Nodes: input column x0..x4, hidden column h0..h3, output column o0..o4.
for idx in range(5):
    pgm.add_node(daft.Node("x" + str(idx), "", 1, idx + 0.5))
for idx in range(4):
    pgm.add_node(daft.Node("h" + str(idx), "", 2.5, idx + 0.85))
for idx in range(5):
    pgm.add_node(daft.Node("o" + str(idx), "", 4.0, idx + 0.5))

# Edges: every input feeds every hidden node; every hidden feeds every output.
for src in range(5):
    for dst in range(4):
        pgm.add_edge("x" + str(src), "h" + str(dst))
for src in range(4):
    for dst in range(5):
        pgm.add_edge("h" + str(src), "o" + str(dst))

pgm.render()
The MNIST and CIFAR10 datasets are integrated into Keras. Using the Keras pre-defined function 'load_data', the dataset will be downloaded into the datasets directory (~/.keras/datasets/) automatically. This avoids downloading the datasets repeatedly.
# Load Data (cached under ~/.keras/datasets/ after the first download)
(X_train, y_train), (X_test, y_test) = mnist.load_data()
nb_classes = 10 # class size
input_unit_size = 28*28 # input vector size
# Preview the first 100 training digits in a 10x10 grid.
show_size = 10
total = 0
plt.figure(figsize=(20,20))
for i in range(show_size):
for j in range(show_size):
draw_digit(X_train[total], show_size, show_size, total+1)
total+=1
plt.show()
# Preprocessing: flatten each 28x28 image to a 784-dim float vector in [0, 1].
X_train = X_train.reshape(X_train.shape[0], input_unit_size)
X_test = X_test.reshape(X_test.shape[0], input_unit_size)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
# Convert labels to categorical one-hot encoding
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')
In Keras there are 2 ways to define the model:
Sequential: The Sequential model is a linear stack of layers.FunctionalAPI: The Keras functional API is the way to go for defining complex models, such as multi-output models, directed acyclic graphs, or models with shared layers.The former is sufficient for this MNIST example.
Model definition:
reLU functionsoftmax function Sequential Model¶Sequential model and specify the input shape: the first layer in a Sequential model (and only the first, because following layers can do automatic shape inference) needs to receive information about its input shape.add to define intermediate layers, activation functions and dropoutsmodel = Sequential()
# Hidden layer: 784 inputs -> 128 ReLU units, Glorot-uniform initialization.
model.add(Dense(128, input_dim=input_unit_size, init='glorot_uniform'))
model.add(Activation("relu"))
# Drop 20% of activations during training to reduce overfitting.
model.add(Dropout(p=0.2))
# Output layer: one softmax unit per digit class.
model.add(Dense(nb_classes, init='glorot_uniform'))
model.add(Activation('softmax'))
FunctionalAPI Model¶# This returns a tensor
# Same MLP expressed with the functional API: tensors are passed layer to layer.
inputs = Input(shape=(input_unit_size,))
# a layer instance is callable on a tensor, and returns a tensor
x = Dense(128, activation='relu')(inputs)
x = Dropout(0.2)(x)
outputs = Dense(nb_classes, activation="softmax")(x)
# This creates a model that includes
# the Input layer, Dense and Dropout layers
model = Model(input=inputs, output=outputs)
# Render the model graph inline as SVG (requires pydot/graphviz).
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))
rmsprop or adagrad), or an instance of the Optimizer class. See: optimizers.SGD. Otherwise default parameters are used. optimizer = Adadelta()
Compilation with compile:
categorical_crossentropy for a multi-class classification problem, binary_crossentropy for a binary classification problem or mse for a mean squared error regression problem), or it can be an objective custom function. See: losses.metrics=['accuracy']. A metric could be the string identifier of an existing metric or a custom metric function.Training with fit:
fit function.fit function in scikit-learn# Compilation
# For a single-input multi-class (nb_classes) classification problem
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
nb_epoch = 5
# Train the model, iterating on the data in batches of 256 samples;
# 20% of the training set is held out for validation.
result = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=256, verbose=2, validation_split=0.2)
# Prediction with test data; evaluate returns [loss, accuracy] per compile metrics.
score = model.evaluate(X_test, Y_test, batch_size=256)
print("test loss:", score[0])
print("test accu:", score[1])
loss and accuracy for each epoch¶After the training, a History object is returned. Besides the parameters of the model, loss and accuracy metrics for each epoch are recorded.
# Plot the per-epoch accuracy curves recorded in the fit() History object,
# then the loss curves; the legend is drawn outside the axes.
x = range(nb_epoch)
for key, title in [("acc", "train acc"), ("val_acc", "val acc")]:
    plt.plot(x, result.history[key], label=title)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

for key, title in [("loss", "train loss"), ("val_loss", "val loss")]:
    plt.plot(x, result.history[key], label=title)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
A Keras model can be saved as json or yaml file. The parameters of the trained model is stored in HDF5 format.
Steps to load a stored model:
compile the model object# save model without weights
# Save the architecture as JSON; weights are stored separately in HDF5.
with open('mnist_model.json', 'w') as f:
json.dump(model.to_json(), f)
model.save_weights('mnist_weights.h5')
# load model
mnist_model = model_from_json(json.load(open("mnist_model.json")))
# load weights
mnist_model.load_weights("./mnist_weights.h5")
# A restored model must be compiled again before it can be evaluated or trained.
mnist_model.compile(loss='categorical_crossentropy', optimizer='adadelta')
Machine Learning Steps using Keras:
The next example uses again the MNIST dataset to demonstrate the CNN technique.
The input image for the CNN model uses a tensor dimension of $(channels, rows, columns)$. Since the MNIST dataset is grayscale (1 color channel), it is actually $(1, 28, 28)$.
# CNN hyper-parameters for MNIST; images are channels-first: (1, 28, 28).
img_rows, img_cols = 28, 28
nb_classes = 10
nb_filters = 10 # the number of filters
nb_pool = 2 # window size of pooling
nb_conv = 3 # window size of filter
# data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Reshape to (samples, channels, rows, cols) and scale pixels to [0, 1].
X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')
# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
Same as in MLP, the convolutional and pooling layers can be inserted by the add function.
# Two 3x3 convolutions (ReLU) -> 2x2 max pooling -> dense 128 -> 10-way softmax.
model = Sequential()
model.add(Convolution2D(nb_filters, nb_conv, nb_conv, input_shape=(1, img_rows, img_cols)))
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer="adadelta",
metrics=['accuracy'])
# Render the architecture, then train for 5 epochs with a 20% validation split.
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))
nb_epoch = 5
cnn_result = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=256, verbose=2, validation_split=0.2)
Let's compare with MLP results.
# Overlay the MLP (result) and CNN (cnn_result) learning curves:
# accuracy first, then loss.
x = range(nb_epoch)
curves = [(result, ""), (cnn_result, "cnn ")]

for history, prefix in curves:
    plt.plot(x, history.history['acc'], label=prefix + "train acc")
    plt.plot(x, history.history['val_acc'], label=prefix + "val acc")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

for history, prefix in curves:
    plt.plot(x, history.history['loss'], label=prefix + "train loss")
    plt.plot(x, history.history['val_loss'], label=prefix + "val loss")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
In Keras, the inputs and outputs of each layer are actually functions. We can visualize their intermediate results as images.
# List the layer objects; their indexes are used below to probe intermediate outputs.
model.layers
# Show the first ten input digits in one row.
show_size = 10
plt.figure(figsize=(20,20))
for i in range(show_size):
draw_digit(X_train[i].reshape(28,28), 1, show_size, i+1)
plt.show()
Visualization of the first convolution layer through a relu function.
# Backend function mapping the input tensor to the output of layer index 1
# (the ReLU after the first convolution).
get_first_layer_output = K.function([model.layers[0].input],
[model.layers[1].output])
first_layer = get_first_layer_output([X_train[0:show_size]])[0]
plt.figure(figsize=(20,20))
# Grid layout: one row per filter, one column per input image.
for img_index, filters in enumerate(first_layer, start=1):
for filter_index, mat in enumerate(filters):
pos = (filter_index)*10+img_index
draw_digit(mat, nb_filters, show_size, pos)
plt.show()
# Same probe for layer index 3 (the ReLU after the second convolution).
get_second_layer_output = K.function([model.layers[0].input],
[model.layers[3].output])
second_output = get_second_layer_output([X_train[0:show_size]])[0]
show_size = 10
plt.figure(figsize=(20,20))
print(second_output.shape)
for img_index, filters in enumerate(second_output, start=1):
for filter_index, mat in enumerate(filters):
pos = (filter_index)*10+img_index
draw_digit(mat, nb_filters, show_size, pos)
plt.show()
pooling¶Visualization of the result through the maxpooling layer.
# Probe layer index 4: the feature maps after 2x2 max pooling.
get_3rd_layer_output = K.function([model.layers[0].input],
[model.layers[4].output])
layers = get_3rd_layer_output([X_train[0:show_size]])[0]
show_size = 10
plt.figure(figsize=(20,20))
# One row per filter, one column per input image.
for img_index, filters in enumerate(layers, start=1):
for filter_index, mat in enumerate(filters):
pos = (filter_index)*10+img_index
draw_digit(mat, nb_filters, show_size, pos)
plt.show()
In this example we'll experiment with the CIFAR10 color images using CNN. A tensor image has the dimension: $(channels=3, rows=32, columns=32)$.
# CIFAR10: 10 classes of 32x32 RGB images, channels-first layout (3, 32, 32).
batch_size = 256
nb_classes = 10
nb_epoch = 5
nb_filter = 10
img_rows, img_cols = 32, 32
img_channels = 3
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')
# Scale pixels to [0, 1] and one-hot encode the labels.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
# Same topology as the MNIST CNN, with a wider (512-unit) dense layer.
model = Sequential()
model.add(Convolution2D(nb_filter, 3, 3, input_shape=(img_channels, img_rows, img_cols)))
model.add(Activation('relu'))
model.add(Convolution2D(nb_filter, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='adadelta',
metrics=['accuracy'])
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_data=(X_test, Y_test))
# visualization
def draw_color(data, row, col, n):
    """Render the image array *data* in cell *n* of a row x col subplot grid."""
    axis = plt.subplot(row, col, n)
    plt.imshow(data)
# Preview the first ten CIFAR10 images; channels-first arrays are transposed
# to (rows, cols, channels) for imshow.
show_size = 10
plt.figure(figsize=(20,20))
for idx in range(show_size):
    rgb = X_train[idx].reshape(3, 32, 32).transpose(1, 2, 0)
    draw_color(rgb, 1, show_size, idx + 1)
plt.show()
# Feature maps after the first convolution + ReLU (layer index 1).
plt.figure(figsize=(20,20))
layer_output = K.function([model.layers[0].input],
[model.layers[1].output])
layers = layer_output([X_train[0:show_size]])[0]
# One row per filter, one column per input image.
for img_index, filters in enumerate(layers, start=1):
for filter_index, mat in enumerate(filters):
pos = (filter_index)*show_size+img_index
draw_color(mat, nb_filter, show_size, pos)
plt.show()
# Feature maps after the second convolution + ReLU (layer index 3).
plt.figure(figsize=(20,20))
layer_output = K.function([model.layers[0].input],
[model.layers[3].output])
layers = layer_output([X_train[0:show_size]])[0]
for img_index, filters in enumerate(layers, start=1):
for filter_index, mat in enumerate(filters):
pos = (filter_index)*show_size+img_index
draw_color(mat, nb_filter, show_size, pos)
plt.show()
# Feature maps after the max-pooling layer (layer index 4).
plt.figure(figsize=(20,20))
layer_output = K.function([model.layers[0].input],
                          [model.layers[4].output])
layers = layer_output([X_train[0:show_size]])[0]
for img_index, filters in enumerate(layers, start=1):
    for filter_index, mat in enumerate(filters):
        # Grid position: one row per filter, one column per input image.
        # (The original also computed mat.shape[1] into an unused local;
        # that dead assignment has been removed.)
        pos = (filter_index)*show_size+img_index
        draw_color(mat, nb_filter, show_size, pos)
plt.show()
In language and voice processing we have neural networks with time series data. Techniques such as Embeddings and RNN can be used here.
In this example we use movie review data. The input is a series of words and the output should be 0 or 1 (binary). Each review has a maximum of 140 words and we limit the maximum number of features to 20,000 words. The language preprocessing functions are provided by Keras.
# IMDB sentiment data: keep the 20,000 most frequent words and pad/truncate
# every review to exactly 140 word indexes.
max_features = 20000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
maxlen=140
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
nb_epoch = 3
An embedding layer turns positive integers (indexes) into dense vectors of fixed size. e.g. $[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]$ (for example, a word converted to a numerical value). According to the model definition the embedding layer converts each word in the input review sentences (up to 140 words) into a 100 dimensional word vector (output_dim=100).
# Embedding -> single GRU (final state only) -> sigmoid for binary sentiment.
model = Sequential()
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be no larger than 19999 (vocabulary size).
# now model.output_shape == (None, 140, 100), where None is the batch dimension.
model.add(Embedding(input_dim=max_features, output_dim=100, init='glorot_uniform', input_length=maxlen))
model.add(GRU(20, return_sequences=False))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adadelta',
metrics=['accuracy'])
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))
# Sanity check: the embedding maps (2, 140) word indexes to (2, 140, 100).
x = X_train[0:2]
print("Inputs shape", x.shape)
embedding_layer = K.function([model.layers[0].input],[model.layers[0].output])
print("Outputs shape", embedding_layer([x])[0].shape)
res_gru_1 = model.fit(X_train, y_train, batch_size=256, nb_epoch=nb_epoch, validation_data=(X_test, y_test))
In Keras there are 3 types of RNN:
SimpleRNN: Fully-connected RNN where the output is to be fed back to input.GRU: Gated Recurrent UnitLSTM: Long-Short Term Memory unitInput of RNN :
Output of RNN:
return_state: a list of tensors. The first tensor is the output. The remaining tensors are the last states, each with shape $(batch_size, units)$return_sequences: 3D tensor with shape $(batch_size, timesteps, units)$return_sequences=True to feed the subsequent recurrent layer.You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.
Differences between GRU and LSTM
# as the first layer in a Sequential model
# Two stacked GRUs: the first must return full sequences to feed the second.
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=100, init='glorot_uniform', input_length=maxlen))
model.add(GRU(20, return_sequences=True))
model.add(GRU(20, return_sequences=False))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adadelta',
metrics=['accuracy'])
res_gru_2 = model.fit(X_train, y_train, batch_size=256, nb_epoch=nb_epoch, validation_data=(X_test, y_test))
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))
# Same topology as the single-GRU model, but with an LSTM cell instead.
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=100, init='glorot_uniform', input_length=maxlen))
model.add(LSTM(20, return_sequences=False))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adadelta',
metrics=['accuracy'])
res_lstm_1 = model.fit(X_train, y_train, batch_size=256, nb_epoch=nb_epoch,
validation_data=(X_test, y_test))
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))
# Compare the three recurrent models: accuracy curves first, then loss curves.
x = range(nb_epoch)
runs = [(res_gru_1, "GRU 1"), (res_gru_2, "GRU 2"), (res_lstm_1, "LSTM")]

for history, name in runs:
    plt.plot(x, history.history['acc'], label=name + " train")
    plt.plot(x, history.history['val_acc'], label=name + " val")
plt.title("binary accuracy")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

for history, name in runs:
    plt.plot(x, history.history['loss'], label=name + " train")
    plt.plot(x, history.history['val_loss'], label=name + " val")
plt.title("binary loss")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
Markov models predict next words from previous word series.
Parts of Alice's Adventures in Wonderland are used as dataset.
get_file: downloads the specified file into the datasets directory ~/.keras/datasets/Tokenizer: convenient class to pre-process the text data. A word sequence is converted into a sequence of numeric values (word indexes).path = get_file('alice.txt', origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")
# First 50 lines of the book form the corpus.
doc = open(path).readlines()[0:50]
tokenizer = Tokenizer()
# trains a list of texts
tokenizer.fit_on_texts(doc)
# list of sequences (one per text input)
doc = tokenizer.texts_to_sequences(doc)
# Drop lines with fewer than two tokens (no (prefix, next word) pair to learn).
doc = [l for l in doc if len(l) > 1]
words_size = sum([len(words) - 1 for words in doc])
maxlen = max([len(x)-1 for x in doc])
# word_index: dictionary mapping words (str) to their rank/index (int).
vocab_size = len(tokenizer.word_index)+1
Learning to predict the next word from the previous sequence:
For large datasets, the Python generator may have serious memory shortage issues. The best way to fit large data is writing a custom generator in Keras yielding (input sequences, labels) with samples in size of batch and use the fit_generator or train_on_batch mechanisms.
def generate_data(X, maxlen, V):
    """Yield one (padded prefixes, one-hot next words) pair per sentence.

    For a sentence [w0, w1, ..., wk] the inputs are the prefixes
    [w0], [w0, w1], ... and the target for each prefix is the word
    that follows it.
    """
    for sentence in X:
        cuts = range(1, len(sentence))
        prefixes = [sentence[0:cut] for cut in cuts]
        next_words = [sentence[cut] for cut in cuts]
        y = np_utils.to_categorical(next_words, V)
        inputs_sequence = sequence.pad_sequences(prefixes, maxlen=maxlen)
        yield (inputs_sequence, y)
# Next-word language model: embedding -> LSTM -> softmax over the vocabulary.
nb_units = 64
model = Sequential()
model.add(Embedding(vocab_size, nb_units, input_length=maxlen))
model.add(LSTM(nb_units, return_sequences=False))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adadelta')
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))
For each epoch, a new sentence is added to the words that were predicted. This repeats until the maximum feature length is reached. The sample function defines the sampling from a multinomial distribution in accordance with the predicted distribution of words.
def sample(p):
    """Draw one index from the categorical distribution proportional to *p*.

    Normalizes *p* in place, then takes a single multinomial draw and
    returns the index of the chosen category.
    """
    p /= sum(p)
    one_hot_draw = np.random.multinomial(1, p, 1)
    _, chosen = np.where(one_hot_draw == 1)
    return chosen[0]
# 19 passes over the corpus; after training, grow a sentence word by word
# starting from "alice's", sampling each next word from the model.
for i in range(19):
for x, y in generate_data(doc, maxlen, vocab_size):
model.train_on_batch(x, y)
in_words = "alice's"
for _ in range(maxlen):
in_sequence = sequence.pad_sequences(tokenizer.texts_to_sequences([in_words]), maxlen=maxlen)
# Sample the next word id from the predicted distribution.
wordid = sample(model.predict(in_sequence)[0])
# Reverse lookup: word_index maps word -> id, so scan for the sampled id.
for k, v in tokenizer.word_index.items():
if v == wordid:
in_words += " " + k
break
print(i, in_words)
Text generation based on the maximum likelihood of the connected words.
# Greedy decoding: always append the single most likely next word.
in_words = "alice's"
for _ in range(maxlen):
in_sequence = sequence.pad_sequences(tokenizer.texts_to_sequences([in_words]), maxlen=maxlen)
wordid = model.predict_classes(in_sequence, verbose=0)[0]
# Reverse lookup of the predicted id in word_index (word -> id).
for k, v in tokenizer.word_index.items():
if v == wordid:
in_words += " " + k
break
print(in_words)
Similar to the previous example, the next model translates Japanese word sequences into English sequences.
As input data we use extracts from the book The Yellow Face which provides line-by-line Japanese-English translations. It contains morphological analysis for Japanese already. Beginning and end of a sentence are marked with 'GOS' and 'EOS' respectively to the English sentences.
# Tokenizer filter: Keras defaults plus Japanese punctuation and quote marks.
filters = base_filter() + "「」・。、()?! '"

# ja data: first 50 lines, already morphologically segmented.
ja_docs = open("data/yellow.ja.txt").readlines()[0:50]
ja_tokenizer = Tokenizer(filters=filters)
ja_tokenizer.fit_on_texts(ja_docs)
ja_docs = ja_tokenizer.texts_to_sequences(ja_docs)

# en data: wrap each sentence with GOS/EOS markers, keep the first 50 lines.
en_docs = open("data/yellow.en.txt").readlines()
en_docs = list(map(lambda x: "GOS " + x + " EOS" , en_docs))[0:50]
en_tokenizer = Tokenizer(filters=filters)
# fit_on_texts is called once here; the original called it twice, which only
# doubled every word count without changing the resulting word_index.
en_tokenizer.fit_on_texts(en_docs)
en_docs = en_tokenizer.texts_to_sequences(en_docs)

# Sequence-length / vocabulary bookkeeping (+1 because index 0 is padding).
encoder_maxlen = max([len(x) for x in ja_docs])
decoder_maxlen = max([len(x)-1 for x in en_docs])
encoder_vocab_size = len(ja_tokenizer.word_index)+1
decoder_vocab_size = len(en_tokenizer.word_index)+1
def gen_training_data(X, Y, encoder_maxlen, decoder_maxlen, V, samples_size):
    """Yield ([encoder batch, decoder batch], one-hot next-word labels).

    X / Y are parallel lists of source / target index sequences. For every
    target prefix y_doc[0:j] one sample is produced whose label is y_doc[j];
    samples are grouped into batches of samples_size and the remainder is
    flushed at the end.
    """
    encoder_inputs = []
    decoder_inputs = []
    next_words = []
    for i in range(len(X)):
        x_doc, y_doc = X[i], Y[i]
        for j in range(1, len(y_doc)):
            decoder_inputs.append(y_doc[0:j])
            next_words.append(y_doc[j])
            encoder_inputs.append(x_doc)
            if len(next_words) == samples_size:
                labels = np_utils.to_categorical(next_words, V)
                encoder_inputs = sequence.pad_sequences(encoder_inputs, maxlen=encoder_maxlen)
                decoder_inputs = sequence.pad_sequences(decoder_inputs, maxlen=decoder_maxlen)
                yield ([encoder_inputs, decoder_inputs], labels)
                encoder_inputs = []
                decoder_inputs = []
                next_words = []
    # Flush the remaining partial batch -- but only if there is one. The
    # original always yielded here, producing an empty batch whenever the
    # total sample count was an exact multiple of samples_size.
    if next_words:
        labels = np_utils.to_categorical(next_words, V)
        encoder_inputs = sequence.pad_sequences(encoder_inputs, maxlen=encoder_maxlen)
        decoder_inputs = sequence.pad_sequences(decoder_inputs, maxlen=decoder_maxlen)
        yield ([encoder_inputs, decoder_inputs], labels)
The neural network language model is constructed using a combination of encoder and decoder Sequentials.
# encoder one doc to one repeated vector
encoder = Sequential()
encoder.add(Embedding(encoder_vocab_size, 128, input_length=encoder_maxlen))
encoder.add(GRU(128, return_sequences=False))
# Repeat the sentence vector once per decoder time step.
encoder.add(RepeatVector(decoder_maxlen))
# decoder inputs
decoder_input = Sequential()
decoder_input.add(Embedding(decoder_vocab_size, 128, input_length=decoder_maxlen))
decoder_input.add(GRU(output_dim=128, return_sequences=True))
decoder_input.add(TimeDistributed(Dense(128)))
# Concatenate encoder context and decoder features at every time step,
# then predict the next English word with a softmax.
model = Sequential()
model.add(Merge([encoder, decoder_input], mode='concat', concat_axis=-1))
model.add(GRU(128, return_sequences=False))
model.add(Dense(decoder_vocab_size))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adadelta')
# Save the architecture once; weights are checkpointed during training below.
with open('my_model.json', 'w') as f:
json.dump(model.to_json(), f)
print("encoder")
display(SVG(model_to_dot(encoder, show_shapes=True).create(prog='dot', format='svg')))
print("decoder input")
display(SVG(model_to_dot(decoder_input, show_shapes=True).create(prog='dot', format='svg')))
print("merge and decoder output")
display(SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg')))
# 21 passes over the data; weights are saved on every 10th pass (0, 10, 20).
for i in range(21):
loss = 0.
for x, y in gen_training_data(ja_docs, en_docs, encoder_maxlen, decoder_maxlen, decoder_vocab_size, 256):
loss += model.train_on_batch(x, y)
print(i, "loss ", loss)
if not i % 10:
model.save_weights('my_model_weights_' + str(i) + '.h5')
# load model
model = model_from_json(json.load(open("my_model.json")))
# Japanese source sentence (already segmented into words).
encoder_words = "ホームズ は 答え た"
encoder_in = sequence.pad_sequences(ja_tokenizer.texts_to_sequences([encoder_words]), maxlen=encoder_maxlen)
# Decode the same sentence with each saved checkpoint (epochs 0, 10, 20).
for i in range(0, 21, 10):
model.load_weights("./my_model_weights_" + str(i) + ".h5")
model.compile(loss='categorical_crossentropy', optimizer='adadelta')
decoder_words = "gos"
for _ in range(encoder_maxlen):
decoder_in = sequence.pad_sequences(en_tokenizer.texts_to_sequences([decoder_words]), maxlen=decoder_maxlen)
# Sample the next English word id from the model's distribution.
wordid = sample(model.predict([encoder_in, decoder_in])[0])
for k, v in en_tokenizer.word_index.items():
if v == wordid:
decoder_words += " " + k
break
# NOTE(review): k leaks from the inner lookup loop, so decoding stops when
# the last matched word is "eos" -- confirm the intended loop nesting.
if k == "eos":
break
print(i, decoder_words.replace("gos", "").replace("eos", ""))
Using an auto encoder on MNIST handwritten digits.
# Autoencoder on MNIST: 784 -> 144 (ReLU) -> 784, trained to reconstruct its
# own input with mean squared error. Only the first 7000 samples are used.
input_unit_size = 28*28
# Import MNIST data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(X_train.shape[0], input_unit_size)[0:7000]
X_train = X_train.astype('float32')
X_train /= 255
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
inputs = Input(shape=(input_unit_size,))
x = Dense(144, activation='relu')(inputs)
outputs = Dense(input_unit_size)(x)
model = Model(input=inputs, output=outputs)
model.compile(loss='mse', optimizer='adadelta')
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))
# Input and target are the same images.
model.fit(X_train, X_train, nb_epoch=19, batch_size=256)
# Show the first 100 original digits in a 10x10 grid for comparison with
# the reconstructions below.
show_size = 10
total = 0
plt.figure(figsize=(20,20))
for _ in range(show_size * show_size):
    draw_digit(X_train[total].reshape(28,28), show_size, show_size, total + 1)
    total += 1
plt.show()
# Visualize the 144-unit hidden code as 12x12 images for the first 100 digits.
get_layer_output = K.function([model.layers[0].input],
[model.layers[1].output])
hidden_outputs = get_layer_output([X_train[0:show_size**2]])[0]
total = 0
plt.figure(figsize=(20,20))
for i in range(show_size):
for j in range(show_size):
draw_digit(hidden_outputs[total].reshape(12, 12), show_size, show_size, total+1)
total+=1
plt.show()
# Visualize the reconstructed 28x28 digits from the output layer.
get_layer_output = K.function([model.layers[0].input],
[model.layers[2].output])
last_outputs = get_layer_output([X_train[0:show_size**2]])[0]
total = 0
plt.figure(figsize=(20,20))
for i in range(show_size):
for j in range(show_size):
draw_digit(last_outputs[total].reshape(28, 28), show_size, show_size, total+1)
total+=1
plt.show()
Count-based methods compute the statistics of how often some word co-occurs with its neighbor words in a large text corpus, and then map these count-statistics down to a small, dense vector for each word. Predictive models directly try to predict a word from its neighbors in terms of learned small, dense embedding vectors (considered parameters of the model).
A.k.a. Word2vec is a particularly computationally-efficient predictive model for learning word embeddings from raw text. It comes in two flavors:
Algorithmically, these models are similar, except that CBOW predicts target words (e.g. 'mat') from source context words ('the cat sits on the'), while the skip-gram does the inverse and predicts source context-words from the target words.
Extractions from Alice's Adventures in Wonderland.
base_filter()
path = get_file('alice.txt', origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")
corpus = open(path).readlines()[0:500]
# Keep only lines containing at least three words (two spaces).
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]
tokenizer = Tokenizer(filters=base_filter()+"'")
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
V = len(tokenizer.word_index) + 1 # vocabulary size
dim = 100 # dimensionality of the word vectors
window_size = 2 # context window width
def generate_data(corpus, window_size, V):
    """Yield (padded context windows, one-hot centre words), one pair per sentence."""
    maxlen = window_size*2
    for words in corpus:
        sentence_len = len(words)
        contexts = []
        labels = []
        for index, word in enumerate(words):
            # Neighbours within window_size on either side, excluding the
            # centre word and positions outside the sentence.
            window = range(index - window_size, index + window_size + 1)
            neighbours = [words[pos] for pos in window
                          if 0 <= pos < sentence_len and pos != index]
            contexts.append(neighbours)
            labels.append(word)
        y = np_utils.to_categorical(labels, V)
        x = sequence.pad_sequences(contexts, maxlen=maxlen)
        yield (x, y)
# CBOW: average the context-word embeddings, then softmax over the vocabulary.
cbow = Sequential()
cbow.add(Embedding(input_dim=V, output_dim=dim, init='glorot_uniform',input_length=window_size*2))
# Mean over the window axis collapses (batch, 2*window, dim) to (batch, dim).
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
cbow.add(Dense(V, init='glorot_uniform', activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer="adadelta")
SVG(model_to_dot(cbow, show_shapes=True).create(prog='dot', format='svg'))
# Ten passes over the corpus, one batch per sentence.
for ite in range(10):
loss = 0.
for x, y in generate_data(corpus, window_size, V):
loss += cbow.train_on_batch(x, y)
print(ite, loss)
word2vec stores the word vectors in the same format as the original word vectors.
# Dump the learned embedding matrix in the word2vec text format:
# a header line "<vocab size> <dim>", then one "<word> <v0> <v1> ..." per word.
# A with-statement replaces the bare open/close so the file is closed even
# if a write fails.
vectors = cbow.get_weights()[0]
with open('vectors.txt', 'w') as f:
    f.write(" ".join([str(V-1), str(dim)]))
    f.write("\n")
    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, list(vectors[i,:]))))
        f.write("\n")
The example below shows the close words to alice using distances between the words.
# Load the saved vectors with gensim and query the nearest neighbours of "alice".
w2v = Word2Vec.load_word2vec_format('./vectors.txt', binary=False)
w2v.most_similar(positive=['alice'])
Skip-gram is another famous algorithm in Word2vec. It is a generalization of n-grams in which the components (typically words) need not be consecutive in the text under consideration, but may leave gaps that are skipped over.
Extractions from Alice's Adventures in Wonderland
base_filter()
path = get_file('alice.txt', origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")
corpus = open(path).readlines()[0:500]
# Keep only lines containing at least three words (two spaces).
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]
tokenizer = Tokenizer(filters=base_filter()+"'")
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
V = len(tokenizer.word_index) + 1 # vocabulary size
dim = 100 # 100-dimensional vector
window_size = 2 # context window size
def generate_data(corpus, window_size, V):
    """Yield (centre-word column vector, one-hot context words) per position.

    For each word in each sentence, one (word, neighbour) training pair is
    emitted for every neighbour within window_size on either side.  The
    unused local ``maxlen`` present in the original has been removed.
    """
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            s = index-window_size
            e = index+window_size+1
            in_words = []
            labels = []
            for i in range(s, e):
                # Skip the centre word itself and positions outside the sentence.
                if i != index and 0 <= i < L:
                    in_words.append([word])
                    labels.append(words[i])
            x = np.array(in_words,dtype=np.int32)
            y = np_utils.to_categorical(labels, V)
            yield (x, y)
# Skip-gram: embed the centre word and predict one context word at a time.
skipgram = Sequential()
skipgram.add(Embedding(input_dim=V, output_dim=dim, init='glorot_uniform', input_length=1))
# Drop the length-1 time axis: (batch, 1, dim) -> (batch, dim).
skipgram.add(Reshape((dim, )))
skipgram.add(Dense(input_dim=dim, output_dim=V, activation='softmax'))
skipgram.compile(loss='categorical_crossentropy', optimizer="adadelta")
SVG(model_to_dot(skipgram, show_shapes=True).create(prog='dot', format='svg'))
# Ten passes over the corpus; train one mini-batch per word position.
for ite in range(10):
print(ite)
for x, y in generate_data(corpus, window_size, V):
skipgram.train_on_batch(x, y)
# Persist the skip-gram embeddings in word2vec text format, then reload with
# gensim and inspect the neighbours of "alice".  A with-statement replaces
# the bare open/close so the file is closed even if a write fails.
vectors = skipgram.get_weights()[0]
with open('vectors.txt', 'w') as f:
    f.write(" ".join([str(V-1),str(dim)]))
    f.write("\n")
    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, list(vectors[i,:]))))
        f.write("\n")
w2v = Word2Vec.load_word2vec_format('./vectors.txt', binary=False)
w2v.most_similar(positive=['alice'])
Keras-related resources: